Load required packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.20.4). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
##
## Attaching package: 'brms'
## The following object is masked from 'package:stats':
##
## ar
library(synthpop)
## Find out more at https://www.synthpop.org.uk/
library(corrplot)
## corrplot 0.92 loaded
library(loo)
## This is loo version 2.6.0
## - Online documentation and vignettes at mc-stan.org/loo
## - As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session.
##
## Attaching package: 'loo'
## The following object is masked from 'package:synthpop':
##
## compare
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:brms':
##
## rwiener
library(bayesplot)
## This is bayesplot version 1.10.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
##
## Attaching package: 'bayesplot'
## The following object is masked from 'package:brms':
##
## rhat
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Load original dataset
# Load the original cirrhosis dataset (expects cirrhosis.csv in the working
# directory); 418 rows, 20 columns per the synthesis log below.
og_data<- read.csv("cirrhosis.csv")
head(og_data)
ID <int> | N_Days <int> | Status <chr> | Drug <chr> | Age <int> | Sex <chr> | Ascites <chr> | Hepatomegaly <chr> | Spiders <chr> | ||
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 400 | D | D-penicillamine | 21464 | F | Y | Y | Y | |
| 2 | 2 | 4500 | C | D-penicillamine | 20617 | F | N | Y | Y | |
| 3 | 3 | 1012 | D | D-penicillamine | 25594 | M | N | N | N | |
| 4 | 4 | 1925 | D | D-penicillamine | 19994 | F | N | Y | Y | |
| 5 | 5 | 1504 | CL | Placebo | 13918 | F | N | Y | Y | |
| 6 | 6 | 2503 | D | Placebo | 24201 | F | N | Y | N |
Synthetic data generation
# Fixed seed so the CART-based synthesis is reproducible
myseed<-1285
# Generate one synthetic dataset (m = 1) of k = 5000 rows from the 418-row
# original, synthesizing every column in order (visit.sequence = 1:ncol).
synthetic_data<-syn(og_data, method = "cart", visit.sequence = (1:ncol(og_data)), m=1, k=5000,seed = myseed)
## Sample(s) of size 5000 will be generated from original data of size 418.
##
##
## Variable(s): Status, Drug, Sex, Ascites, Hepatomegaly, Spiders, Edema have been changed for synthesis from character to factor.
## Warning: In your synthesis there are numeric variables with 5 or fewer levels: Stage.
## Consider changing them to factors. You can do it using parameter 'minnumlevels'.
##
## Synthesis
## -----------
## ID N_Days Status Drug Age Sex Ascites Hepatomegaly Spiders Edema
## Bilirubin Cholesterol Albumin Copper Alk_Phos SGOT Tryglicerides Platelets Prothrombin Stage
# Pull the synthesized data frame out of the synthpop result object.
# NOTE(review): "synthethic_data" is a misspelling, but it is referenced
# under this exact name later in the script, so it must not be renamed here.
synthethic_data<-as.data.frame(synthetic_data$syn)
head(synthethic_data)
ID <int> | N_Days <dbl> | Status <chr> | Drug <chr> | Age <dbl> | Sex <chr> | Ascites <chr> | Hepatomegaly <chr> | Spiders <chr> | ||
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 395 | 193 | D | NA | 17897 | F | NA | NA | NA | |
| 2 | 84 | 2540 | D | D-penicillamine | 19544 | F | N | N | N | |
| 3 | 146 | 762 | D | Placebo | 24803 | M | N | Y | N | |
| 4 | 117 | 515 | D | D-penicillamine | 19577 | M | N | Y | N | |
| 5 | 396 | 1328 | C | NA | 22111 | F | NA | NA | NA | |
| 6 | 170 | 2713 | C | Placebo | 17809 | F | Y | Y | N |
Synthetic data evaluation
#' Per-column summary table for comparing original vs synthetic data.
#'
#' For every column of `data` this reports the number of unique values; for
#' numeric columns additionally the mean, skewness and kurtosis (NA counts
#' removed); for non-numeric columns the most frequent category.
#' `skewness()`/`kurtosis()` come from the already-attached e1071 package.
#'
#' Returns a data.frame with columns Features, Unique_Values, Mean,
#' Skewness, Kurtosis, Max_Category (one row per input column).
#'
#' Improvements over the previous version: rows are built with lapply and
#' combined once with do.call(rbind, ...) instead of growing a data.frame
#' with rbind inside a loop (O(n^2) copying); the result variable no longer
#' shadows base::table(); empty input returns an empty, correctly-typed
#' data.frame instead of NULL.
create_unique_values_table <- function(data) {
  summarize_column <- function(col) {
    values <- data[[col]]
    if (is.numeric(values)) {
      data.frame(
        Features = col,
        Unique_Values = length(unique(values)),
        Mean = mean(values, na.rm = TRUE),
        Skewness = skewness(values, na.rm = TRUE),
        Kurtosis = kurtosis(values, na.rm = TRUE),
        Max_Category = NA,
        stringsAsFactors = FALSE
      )
    } else {
      # Most frequent level; ties broken by sort order, as before
      counts <- sort(table(values), decreasing = TRUE)
      data.frame(
        Features = col,
        Unique_Values = length(unique(values)),
        Mean = NA,
        Skewness = NA,
        Kurtosis = NA,
        Max_Category = names(counts)[1],
        stringsAsFactors = FALSE
      )
    }
  }
  rows <- lapply(colnames(data), summarize_column)
  if (length(rows) == 0) {
    # Preserve the documented shape even for a zero-column input
    return(data.frame(Features = character(), Unique_Values = numeric(),
                      Mean = numeric(), Skewness = numeric(),
                      Kurtosis = numeric(), Max_Category = character(),
                      stringsAsFactors = FALSE))
  }
  do.call(rbind, rows)
}
# Summarize both datasets and lay the statistics side by side so each
# feature's original and synthetic summaries can be compared directly.
original_table <- create_unique_values_table(og_data)
synthetic_table <- create_unique_values_table(synthethic_data)
comparison_df <- data.frame(
  Feature = original_table$Features,
  Original_Unique_Values = original_table$Unique_Values,
  Synthetic_Unique_Values = synthetic_table$Unique_Values,
  Original_Mean = original_table$Mean,
  Synthetic_Mean = synthetic_table$Mean,
  Original_Skewness = original_table$Skewness,
  Synthetic_Skewness = synthetic_table$Skewness,
  Original_Kurtosis = original_table$Kurtosis,
  Synthetic_Kurtosis = synthetic_table$Kurtosis,
  Original_Max_Category = original_table$Max_Category,
  Synthetic_Max_Category = synthetic_table$Max_Category
)
print(comparison_df)
## Feature Original_Unique_Values Synthetic_Unique_Values Original_Mean
## 1 ID 418 418 209.500000
## 2 N_Days 399 399 1917.782297
## 3 Status 3 3 NA
## 4 Drug 3 3 NA
## 5 Age 344 344 18533.351675
## 6 Sex 2 2 NA
## 7 Ascites 3 3 NA
## 8 Hepatomegaly 3 3 NA
## 9 Spiders 3 3 NA
## 10 Edema 3 3 NA
## 11 Bilirubin 98 98 3.220813
## 12 Cholesterol 202 202 369.510563
## 13 Albumin 154 154 3.497440
## 14 Copper 159 159 97.648387
## 15 Alk_Phos 296 296 1982.655769
## 16 SGOT 180 180 122.556346
## 17 Tryglicerides 147 147 124.702128
## 18 Platelets 244 244 257.024570
## 19 Prothrombin 49 49 10.731731
## 20 Stage 5 5 3.024272
## Synthetic_Mean Original_Skewness Synthetic_Skewness Original_Kurtosis
## 1 208.732800 0.00000000 0.009417929 -1.2086158
## 2 1890.130400 0.46921558 0.511656340 -0.5027025
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 18623.209200 0.08622782 0.114668567 -0.6350537
## 6 NA NA NA NA
## 7 NA NA NA NA
## 8 NA NA NA NA
## 9 NA NA NA NA
## 10 NA NA NA NA
## 11 3.409260 2.69813743 2.572163421 7.9025102
## 12 393.721673 3.37260482 3.025709201 13.9456623
## 13 3.489242 -0.46417641 -0.488826445 0.5287235
## 14 97.788425 2.28139465 2.420272874 7.4147980
## 15 2042.974659 2.96411855 2.834845920 9.4092979
## 16 125.905795 1.43529211 1.503921484 4.1777801
## 17 130.383963 2.49711592 2.440358524 11.4701457
## 18 257.449539 0.62248299 0.617158166 0.8189369
## 19 10.765287 2.20726861 2.260816276 9.8441333
## 20 3.066775 -0.49266562 -0.534760694 -0.6565785
## Synthetic_Kurtosis Original_Max_Category Synthetic_Max_Category
## 1 -1.1988904 <NA> <NA>
## 2 -0.4402123 <NA> <NA>
## 3 NA C C
## 4 NA D-penicillamine D-penicillamine
## 5 -0.6610101 <NA> <NA>
## 6 NA F F
## 7 NA N N
## 8 NA Y Y
## 9 NA N N
## 10 NA N N
## 11 6.9864052 <NA> <NA>
## 12 10.2915316 <NA> <NA>
## 13 0.5657029 <NA> <NA>
## 14 8.6058074 <NA> <NA>
## 15 8.5054136 <NA> <NA>
## 16 4.4141960 <NA> <NA>
## 17 10.9591418 <NA> <NA>
## 18 0.9213549 <NA> <NA>
## 19 9.3532839 <NA> <NA>
## 20 -0.5788484 <NA> <NA>
Now we will treat the synthetic data as the main dataset and try to gain insights about the features from it; then we will perform a train/test split and evaluate the model on the test data.
Now we have to pre-process the data (one-hot encoding of categorical variables and some new feature generation).
# Encode the outcome numerically: D (death) -> 1, C (censored) -> 0,
# CL (censored due to liver transplant) -> 2; unseen values become NA.
og_data$Status <- c(1, 0, 2)[match(og_data$Status, c("D", "C", "CL"))]

# Convert age from days to years
og_data$Age <- og_data$Age / 365.25

# Binary (0/1) encoding of the two-level categorical variables
og_data$Sex <- as.integer(og_data$Sex == "M")
og_data$Drug <- as.integer(og_data$Drug == "D-penicillamine")
og_data$Edema <- as.integer(og_data$Edema == "Y")
og_data$Ascites <- as.integer(og_data$Ascites == "Y")
og_data$Spiders <- as.integer(og_data$Spiders == "Y")
og_data$Hepatomegaly <- as.integer(og_data$Hepatomegaly == "Y")

# Drop the row identifier and sort rows by treatment arm
og_data$ID <- NULL
og_data <- og_data[order(og_data$Drug), ]
head(og_data)
N_Days <int> | Status <dbl> | Drug <int> | Age <dbl> | Sex <int> | Ascites <int> | Hepatomegaly <int> | Spiders <int> | Edema <int> | ||
|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 1504 | 2 | 0 | 38.10541 | 0 | 0 | 1 | 1 | 0 | |
| 6 | 2503 | 1 | 0 | 66.25873 | 0 | 0 | 1 | 0 | 0 | |
| 7 | 1832 | 0 | 0 | 55.53457 | 0 | 0 | 1 | 0 | 0 | |
| 8 | 2466 | 1 | 0 | 53.05681 | 0 | 0 | 0 | 0 | 0 | |
| 10 | 51 | 1 | 0 | 70.55989 | 0 | 1 | 0 | 1 | 1 | |
| 11 | 3762 | 1 | 0 | 53.71389 | 0 | 0 | 1 | 1 | 0 |
# Feature generation: clinically motivated indicators and interactions.
# NOTE: the statement order matters here — Age is rounded before it is used
# below, and N_Days is rescaled to years only at the very end, after
# Diag_Month/Diag_Year have been derived from the day count.
og_data<-na.omit(og_data)  # drop all rows with any missing value
og_data$Age <- round(og_data$Age)  # age to whole years
# Thrombocytopenia flag: platelet count below the normal lower limit
threshold_platelets <- 150
og_data$thrombocytopenia <- ifelse(og_data$Platelets < threshold_platelets, 1, 0)
threshold_alk_phos_upper <- 147 # Upper limit of normal range
threshold_alk_phos_lower <- 44 # Lower limit of normal range
# Abnormal alkaline phosphatase flag (above upper OR below lower limit)
og_data$elevated_alk_phos <- ifelse(og_data$Alk_Phos > threshold_alk_phos_upper | og_data$Alk_Phos < threshold_alk_phos_lower, 1, 0)
# Copper within normal reference range
normal_copper_range <- c(62, 140)
og_data$normal_copper <- ifelse(og_data$Copper >= normal_copper_range[1] & og_data$Copper <= normal_copper_range[2], 1, 0)
# Albumin within normal reference range
normal_albumin_range <- c(3.4, 5.4)
og_data$normal_albumin <- ifelse(og_data$Albumin >= normal_albumin_range[1] & og_data$Albumin <= normal_albumin_range[2], 1, 0)
# NOTE(review): Age is in years here while N_Days is still in days, so this
# difference mixes units — confirm intent. (It is later dropped as highly
# correlated, so it never reaches the models.)
og_data$DiagnosisDays <- og_data$Age - og_data$N_Days
# Age bands: (19,29]=0, (29,49]=1, (49,64]=2, (64,99]=3; ages <= 19 map to NA
og_data$Age_Group <- cut(og_data$Age, breaks = c(19, 29, 49, 64, 99), labels = c(0, 1, 2, 3))
og_data$Age_Group <- as.integer(as.character(og_data$Age_Group))
# Count of present symptoms (each is a 0/1 indicator)
og_data$Symptom_Score <- rowSums(og_data[, c("Ascites", "Hepatomegaly", "Spiders")])
# NOTE(review): rowMeans over raw, differently-scaled labs — Alk_Phos
# dominates the average; consider scaling first. (Also dropped later.)
og_data$Liver_Function_Index <- rowMeans(og_data[, c("Bilirubin", "Albumin", "Alk_Phos", "SGOT")])
og_data$Bilirubin_Albumin<-og_data$Bilirubin * og_data$Albumin
og_data$Risk_Score <- og_data$Bilirubin + og_data$Albumin - og_data$Alk_Phos
# Month (0-12) within the final follow-up year, and whole years of follow-up
og_data$Diag_Month <- as.integer((og_data$N_Days %% 365) / 30)
og_data$Diag_Year <- as.integer(og_data$N_Days / 365)
og_data$N_Days <- og_data$N_Days / 365.25  # follow-up days -> years (must stay last)
head(og_data)
N_Days <dbl> | Status <dbl> | Drug <int> | Age <dbl> | Sex <int> | Ascites <int> | Hepatomegaly <int> | Spiders <int> | Edema <int> | ||
|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 4.1177276 | 2 | 0 | 38 | 0 | 0 | 1 | 1 | 0 | |
| 7 | 5.0157426 | 0 | 0 | 56 | 0 | 0 | 1 | 0 | 0 | |
| 8 | 6.7515400 | 1 | 0 | 53 | 0 | 0 | 0 | 0 | 0 | |
| 10 | 0.1396304 | 1 | 0 | 71 | 0 | 1 | 0 | 1 | 1 | |
| 11 | 10.2997947 | 1 | 0 | 54 | 0 | 0 | 1 | 1 | 0 | |
| 12 | 0.8323066 | 1 | 0 | 59 | 0 | 0 | 0 | 1 | 0 |
Explanatory Analysis(Distribution for each class)
library(ggplot2)

# Continuous features to inspect per outcome class
numeric_cols <- c('N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
                  'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
                  'Risk_Score', 'Liver_Function_Index', 'DiagnosisDays', 'Bilirubin_Albumin',
                  'Diag_Year', 'Diag_Month')
# NOTE(review): train_to_scale is built but not referenced in this section
train_to_scale <- og_data[, numeric_cols]

# Pastel fill palette, one colour per Status level
ultra_light_colors <- c("#F0F8FF", "#F6F6F6", "#F0FFF0", "#FAFAD2", "#FFE4E1",
                        "#FFF5EE", "#F5FFFA", "#F0FFFF", "#FFFAF0", "#F8F8FF")

# Violin + boxplot + mean marker for one feature, split by Status class.
col_per_class <- function(og_data, col) {
  base_plot <- ggplot(og_data, aes(x = factor(Status), y = .data[[col]], fill = factor(Status)))
  base_plot +
    geom_violin(trim = FALSE) +
    geom_boxplot(width = 0.1, outlier.shape = NA, fill = "white", color = "black") +
    stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "black") +
    labs(title = paste("Distribution for", col, "for each class")) +
    scale_fill_manual(values = ultra_light_colors) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
}

# One plot per numeric feature; printing the list renders all of them
plots <- lapply(numeric_cols, function(feature) col_per_class(og_data, feature))
plots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
Based on the distributional similarity, we observe that classes C and CL are
close enough, so we merge them (C = 0, CL = 2, both mapped to 0), leaving 2
categories from which we will make predictions.
# Merge the two censored classes: keep 1 (death, the event of interest) vs
# 0 (censored or transplant). Status is numeric (0/1/2) at this point, so
# the previous recode() call with quoted-number names relied on dplyr
# silently coercing names to match a numeric vector — fragile, and recode()
# is superseded. An explicit ifelse produces the same character result
# ("0"/"1"), which the standardization step below converts to a factor.
og_data$Status <- ifelse(og_data$Status == 1, "1", "0")
Standardization (to put all features on the same scale)
# Treat any column with fewer than 5 distinct values as categorical and
# convert it to a factor; z-score scale everything else.
threshold <- 5
group_variables <- names(og_data)[vapply(og_data, function(x) length(unique(x)), integer(1)) < threshold]
og_data[group_variables] <- lapply(og_data[group_variables], factor)
# BUG FIX: elevated_alk_phos was just turned into a factor with levels
# "0"/"1"; as.numeric() on a factor returns the internal level codes (1/2),
# not the original values. Convert via as.character() to recover the 0/1
# indicator. (It stays in group_variables, so it is still excluded from
# scaling below, as before.)
og_data$elevated_alk_phos <- as.numeric(as.character(og_data$elevated_alk_phos))
non_group_variables <- setdiff(colnames(og_data), group_variables)
og_data[non_group_variables] <- scale(og_data[non_group_variables])
Check the correlation matrix and exclude highly correlated variables to reduce the overfitting tendency
# Pairwise Pearson correlations among the scaled continuous features
correlation_matrix <- cor(og_data[non_group_variables], use = "pairwise.complete.obs")
# Upper-triangle heatmap, hierarchically clustered so correlated groups
# appear as blocks
corrplot(
  correlation_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)
Columns to drop due to correlation
# Engineered features found to be highly correlated with existing columns;
# drop them to reduce redundancy (column order is otherwise preserved).
columns_to_drop <- c('DiagnosisDays', 'Bilirubin_Albumin', 'Diag_Year', 'Risk_Score', 'Liver_Function_Index')
og_data <- og_data[, setdiff(names(og_data), columns_to_drop)]
Our final data set
head(og_data)
N_Days <dbl> | Status <fct> | Drug <fct> | Age <dbl> | Sex <fct> | Ascites <fct> | Hepatomegaly <fct> | Spiders <fct> | Edema <fct> | ||
|---|---|---|---|---|---|---|---|---|---|---|
| 5 | -0.4271621 | 0 | 0 | -1.1199214 | 0 | 0 | 1 | 1 | 0 | |
| 7 | -0.1322989 | 0 | 0 | 0.5888530 | 0 | 0 | 1 | 0 | 0 | |
| 8 | 0.4376501 | 1 | 0 | 0.3040573 | 0 | 0 | 0 | 0 | 0 | |
| 10 | -1.7333700 | 1 | 0 | 2.0128317 | 0 | 1 | 0 | 1 | 1 | |
| 11 | 1.6027193 | 1 | 0 | 0.3989892 | 0 | 0 | 1 | 1 | 0 | |
| 12 | -1.5059298 | 1 | 0 | 0.8736488 | 0 | 0 | 0 | 1 | 0 |
Train and Test split of final dataset
# Reproducible 70/30 train/test split, stratified on the outcome via
# caret::createDataPartition (keeps class proportions similar in both sets).
set.seed(12348)
index <- createDataPartition(og_data$Status, p = 0.7, list = FALSE)
train_data <- og_data[index, ]
test_data <- og_data[-index, ]
Fit Bayesian logistic regression model with all features and considering group level effects(mega_model)
# "Mega model": Bayesian logistic regression on all remaining predictors
# plus random intercepts for Age_Group, Stage and Symptom_Score.
# NOTE(review): those three variables also enter as fixed effects through
# the `.` in the formula (see the Formula line in summary below), so each
# appears both as population- and group-level — confirm this is intended.
# Priors: student_t(4, 1, 2) on all slopes, overridden for Drug1 by an
# informative normal(-0.5, 0.25) encoding an expected protective effect.
# adapt_delta = 0.98 tightens the sampler to avoid divergent transitions.
Model1 <- brm(Status ~ . + (1|Age_Group) + (1|Stage) + (1| Symptom_Score),
data = train_data,
family = bernoulli(link = "logit"),
prior = c(set_prior("student_t(4, 1, 2)", class = "b"),
set_prior("normal(-0.5,0.25)", coef = "Drug1", class = "b")),
control = list(adapt_delta = 0.98),
iter = 4000, warmup = 2000)
## Compiling Stan program...
## Start sampling
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.00011 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.1 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 1: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 1: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 1: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 1: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 1: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 1: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 1: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 1: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 1: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 1: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 1: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 4.471 seconds (Warm-up)
## Chain 1: 4.032 seconds (Sampling)
## Chain 1: 8.503 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 2.2e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.22 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 2: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 2: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 2: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 2: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 2: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 2: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 2: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 2: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 2: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 2: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 2: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 4.418 seconds (Warm-up)
## Chain 2: 4.001 seconds (Sampling)
## Chain 2: 8.419 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 2e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 3: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 3: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 3: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 3: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 3: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 3: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 3: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 3: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 3: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 3: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 3: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 4.663 seconds (Warm-up)
## Chain 3: 4.24 seconds (Sampling)
## Chain 3: 8.903 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 2e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 4: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 4: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 4: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 4: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 4: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 4: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 4: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 4: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 4: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 4: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 4: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 5.1 seconds (Warm-up)
## Chain 4: 3.964 seconds (Sampling)
## Chain 4: 9.064 seconds (Total)
## Chain 4:
summary(Model1)
## Family: bernoulli
## Links: mu = logit
## Formula: Status ~ N_Days + Drug + Age + Sex + Ascites + Hepatomegaly + Spiders + Edema + Bilirubin + Cholesterol + Albumin + Copper + Alk_Phos + SGOT + Tryglicerides + Platelets + Prothrombin + Stage + thrombocytopenia + elevated_alk_phos + normal_copper + normal_albumin + Age_Group + Symptom_Score + Diag_Month + (1 | Age_Group) + (1 | Stage) + (1 | Symptom_Score)
## Data: train_data (Number of observations: 194)
## Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
## total post-warmup draws = 8000
##
## Group-Level Effects:
## ~Age_Group (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.42 1.20 0.06 4.50 1.00 5711 4428
##
## ~Stage (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.36 1.16 0.05 4.20 1.00 3866 4034
##
## ~Symptom_Score (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.43 1.27 0.05 4.73 1.00 4241 4535
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept -4.93 3.64 -12.23 2.22 1.00 8723 4973
## N_Days -0.81 0.33 -1.46 -0.18 1.00 9938 6111
## Drug1 -0.22 0.23 -0.66 0.22 1.00 15755 5817
## Age 0.21 0.53 -0.85 1.26 1.00 11149 5334
## Sex1 0.77 0.77 -0.77 2.29 1.00 9251 6515
## Ascites1 1.04 1.64 -1.93 4.51 1.00 9567 5005
## Hepatomegaly1 -1.34 1.16 -3.64 0.87 1.00 6097 5342
## Spiders1 0.72 1.09 -1.41 2.89 1.00 6137 5343
## Edema1 6.31 3.33 1.92 14.52 1.00 6435 3403
## Bilirubin 1.45 0.66 0.23 2.80 1.00 9770 6607
## Cholesterol 0.06 0.30 -0.52 0.64 1.00 10602 6060
## Albumin 0.10 0.46 -0.81 1.01 1.00 7187 5962
## Copper -0.12 0.38 -0.89 0.62 1.00 8726 6369
## Alk_Phos 1.05 0.34 0.44 1.77 1.00 9376 5452
## SGOT 0.60 0.28 0.06 1.19 1.00 11463 5987
## Tryglicerides 0.48 0.33 -0.15 1.13 1.00 10733 6516
## Platelets 0.28 0.33 -0.36 0.93 1.00 9218 6580
## Prothrombin 0.90 0.30 0.33 1.51 1.00 11474 6101
## Stage2 2.02 1.55 -1.14 5.00 1.00 6775 5793
## Stage3 1.52 1.47 -1.39 4.48 1.00 7926 5350
## Stage4 1.57 1.48 -1.52 4.63 1.00 8572 5744
## thrombocytopenia1 1.17 0.95 -0.70 3.05 1.00 9944 6291
## elevated_alk_phos 0.99 2.67 -4.57 6.46 1.00 11119 3454
## normal_copper1 -0.51 0.58 -1.69 0.62 1.00 10143 6274
## normal_albumin1 0.22 0.76 -1.25 1.71 1.00 8744 7106
## Age_Group1 0.17 1.80 -3.38 3.77 1.00 8481 5564
## Age_Group2 2.09 1.70 -1.24 5.52 1.00 7476 5683
## Age_Group3 0.94 1.74 -2.54 4.42 1.00 8465 5457
## Symptom_Score1 1.56 1.44 -1.36 4.46 1.00 7046 4962
## Symptom_Score2 1.67 1.74 -1.76 5.31 1.00 6926 4956
## Symptom_Score3 -1.10 2.45 -6.66 2.99 1.00 8142 4905
## Diag_Month 0.86 0.31 0.27 1.48 1.00 11407 6400
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Now the same model with a different prior (prior sensitivity analysis)
# Prior sensitivity check: identical structure to Model1 but with a single
# weakly-informative student_t(3, 0, 2) prior on every slope (no special
# prior on Drug1). Comparing the two summaries shows how much the Drug
# coefficient, in particular, is driven by the informative prior.
Model2 <- brm(Status ~ . + (1|Age_Group) + (1|Stage) + (1| Symptom_Score),
data = train_data,
family = bernoulli(link = "logit"),
prior = set_prior("student_t(3, 0, 2)", class = "b"),
control = list(adapt_delta = 0.98),
iter = 4000, warmup = 2000)
## Compiling Stan program...
## Start sampling
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 7.9e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.79 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 1: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 1: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 1: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 1: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 1: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 1: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 1: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 1: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 1: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 1: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 1: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 4.248 seconds (Warm-up)
## Chain 1: 3.598 seconds (Sampling)
## Chain 1: 7.846 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 1.8e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.18 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 2: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 2: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 2: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 2: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 2: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 2: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 2: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 2: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 2: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 2: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 2: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 4.303 seconds (Warm-up)
## Chain 2: 3.548 seconds (Sampling)
## Chain 2: 7.851 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 1.8e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.18 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 3: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 3: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 3: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 3: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 3: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 3: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 3: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 3: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 3: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 3: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 3: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 4.211 seconds (Warm-up)
## Chain 3: 3.427 seconds (Sampling)
## Chain 3: 7.638 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 1.6e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.16 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 4: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 4: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 4: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 4: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 4: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 4: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 4: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 4: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 4: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 4: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 4: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 4.007 seconds (Warm-up)
## Chain 4: 3.457 seconds (Sampling)
## Chain 4: 7.464 seconds (Total)
## Chain 4:
summary(Model2)
## Family: bernoulli
## Links: mu = logit
## Formula: Status ~ N_Days + Drug + Age + Sex + Ascites + Hepatomegaly + Spiders + Edema + Bilirubin + Cholesterol + Albumin + Copper + Alk_Phos + SGOT + Tryglicerides + Platelets + Prothrombin + Stage + thrombocytopenia + elevated_alk_phos + normal_copper + normal_albumin + Age_Group + Symptom_Score + Diag_Month + (1 | Age_Group) + (1 | Stage) + (1 | Symptom_Score)
## Data: train_data (Number of observations: 194)
## Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
## total post-warmup draws = 8000
##
## Group-Level Effects:
## ~Age_Group (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.40 1.25 0.05 4.75 1.00 5856 3933
##
## ~Stage (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.66 1.30 0.07 4.87 1.00 4737 4757
##
## ~Symptom_Score (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.59 1.37 0.06 5.05 1.00 4933 4718
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept -2.50 4.10 -10.23 5.69 1.00 8612 4175
## N_Days -0.75 0.31 -1.37 -0.16 1.00 11042 6428
## Drug1 0.98 0.53 -0.03 2.03 1.00 14578 6722
## Age 0.18 0.52 -0.88 1.21 1.00 12365 6010
## Sex1 0.58 0.77 -0.93 2.10 1.00 12135 6080
## Ascites1 0.60 1.70 -2.46 4.29 1.00 11254 4101
## Hepatomegaly1 -1.39 1.19 -3.97 0.76 1.00 7182 5432
## Spiders1 0.57 1.13 -1.69 2.81 1.00 6946 5916
## Edema1 8.38 5.33 2.34 22.08 1.00 5404 3068
## Bilirubin 1.59 0.67 0.36 2.98 1.00 10435 6451
## Cholesterol -0.04 0.31 -0.64 0.56 1.00 11779 6373
## Albumin 0.05 0.45 -0.83 0.94 1.00 9349 6462
## Copper -0.14 0.36 -0.88 0.55 1.00 8914 5716
## Alk_Phos 1.04 0.33 0.44 1.70 1.00 11028 6359
## SGOT 0.65 0.27 0.14 1.20 1.00 12885 6592
## Tryglicerides 0.51 0.33 -0.12 1.18 1.00 11876 6555
## Platelets 0.22 0.34 -0.45 0.89 1.00 10483 5984
## Prothrombin 0.94 0.30 0.37 1.55 1.00 11327 6435
## Stage2 1.19 1.73 -2.35 4.72 1.00 7744 5250
## Stage3 0.80 1.63 -2.56 3.99 1.00 8857 5341
## Stage4 0.78 1.67 -2.50 4.19 1.00 8867 5773
## thrombocytopenia1 0.66 0.96 -1.17 2.57 1.00 11219 5930
## elevated_alk_phos -0.01 3.19 -6.26 6.21 1.00 10300 3079
## normal_copper1 -0.69 0.58 -1.86 0.45 1.00 12884 6525
## normal_albumin1 0.20 0.75 -1.24 1.68 1.00 10464 5231
## Age_Group1 -0.66 1.71 -3.99 2.79 1.00 9703 5904
## Age_Group2 1.30 1.76 -2.06 4.95 1.00 8105 4334
## Age_Group3 -0.03 1.74 -3.60 3.50 1.00 9575 4871
## Symptom_Score1 1.06 1.64 -2.34 4.31 1.00 7625 4672
## Symptom_Score2 1.25 1.92 -2.31 5.38 1.00 7674 5309
## Symptom_Score3 -1.79 2.57 -7.81 2.50 1.00 8442 4482
## Diag_Month 0.90 0.31 0.31 1.53 1.00 11720 6297
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Now fit a different model on the training data, without group-level effects but keeping the first prior
# Fixed-effects-only model: same predictors and the same priors as Model1
# but no random intercepts, and brms defaults for sampling (iter = 2000,
# warmup = 1000 — note this differs from Models 1-2).
# NOTE(review): despite the surrounding narrative mentioning synthetic
# data, this is fit on train_data (derived from og_data) — confirm intent.
Model3 <- brm(Status ~ .,
data = train_data,
family = bernoulli(link = "logit"),
prior = c(set_prior("student_t(4, 1, 2)", class = "b"),
set_prior("normal(-0.5, 0.25)", coef = "Drug1", class = "b")))
## Compiling Stan program...
## Start sampling
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 4.8e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.48 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.205 seconds (Warm-up)
## Chain 1: 0.19 seconds (Sampling)
## Chain 1: 0.395 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 9e-06 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.212 seconds (Warm-up)
## Chain 2: 0.192 seconds (Sampling)
## Chain 2: 0.404 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 9e-06 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.21 seconds (Warm-up)
## Chain 3: 0.21 seconds (Sampling)
## Chain 3: 0.42 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9e-06 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.211 seconds (Warm-up)
## Chain 4: 0.185 seconds (Sampling)
## Chain 4: 0.396 seconds (Total)
## Chain 4:
# Posterior summary for Model3: coefficient estimates, 95% credible intervals,
# and convergence diagnostics (Rhat, bulk/tail ESS).
summary(Model3)
## Family: bernoulli
## Links: mu = logit
## Formula: Status ~ N_Days + Drug + Age + Sex + Ascites + Hepatomegaly + Spiders + Edema + Bilirubin + Cholesterol + Albumin + Copper + Alk_Phos + SGOT + Tryglicerides + Platelets + Prothrombin + Stage + thrombocytopenia + elevated_alk_phos + normal_copper + normal_albumin + Age_Group + Symptom_Score + Diag_Month
## Data: train_data (Number of observations: 194)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept -4.96 3.51 -11.62 1.78 1.00 4192 2029
## N_Days -0.80 0.31 -1.43 -0.21 1.00 4634 3275
## Drug1 -0.22 0.23 -0.68 0.24 1.00 5841 2579
## Age 0.22 0.52 -0.81 1.22 1.00 4300 2834
## Sex1 0.74 0.76 -0.75 2.25 1.00 4891 3277
## Ascites1 0.90 1.55 -2.08 4.22 1.00 4365 2586
## Hepatomegaly1 -1.27 0.95 -3.24 0.53 1.00 3292 3110
## Spiders1 0.71 0.91 -1.06 2.49 1.00 3520 3060
## Edema1 5.94 3.09 1.80 13.58 1.00 3440 1846
## Bilirubin 1.39 0.66 0.15 2.70 1.00 4055 3115
## Cholesterol 0.05 0.29 -0.51 0.62 1.00 4770 2823
## Albumin 0.10 0.46 -0.82 0.97 1.00 3782 2858
## Copper -0.13 0.38 -0.88 0.59 1.00 4352 3061
## Alk_Phos 1.03 0.33 0.43 1.72 1.00 4261 3306
## SGOT 0.59 0.27 0.07 1.11 1.00 4784 3175
## Tryglicerides 0.47 0.32 -0.17 1.12 1.00 4856 3464
## Platelets 0.27 0.32 -0.35 0.90 1.00 3659 3115
## Prothrombin 0.85 0.29 0.31 1.44 1.00 4244 2846
## Stage2 2.34 1.09 0.34 4.66 1.00 3177 2304
## Stage3 1.64 1.06 -0.29 3.92 1.00 2779 2333
## Stage4 1.73 1.10 -0.36 4.05 1.00 3273 2164
## thrombocytopenia1 1.16 0.94 -0.67 2.99 1.00 4556 3166
## elevated_alk_phos 1.00 3.02 -5.00 6.81 1.00 4578 1552
## normal_copper1 -0.48 0.56 -1.59 0.63 1.00 4779 3194
## normal_albumin1 0.19 0.78 -1.37 1.71 1.00 3863 2813
## Age_Group1 -0.10 1.48 -3.07 2.78 1.00 3611 2627
## Age_Group2 2.42 1.35 -0.20 5.20 1.00 3322 2300
## Age_Group3 0.94 1.54 -2.12 4.02 1.00 3437 2695
## Symptom_Score1 1.65 0.92 -0.09 3.49 1.00 3205 2415
## Symptom_Score2 1.76 1.48 -1.04 4.76 1.00 2825 2732
## Symptom_Score3 -1.42 2.37 -6.76 2.62 1.00 3250 2278
## Diag_Month 0.84 0.30 0.26 1.47 1.00 5146 3391
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Comparing the 3 models using the loo function
# Compare out-of-sample predictive performance of the three models via
# PSIS-LOO cross-validation (elpd_loo). Warnings about pareto_k > 0.7 flag
# influential observations; rerunning with moment_match = TRUE would refine
# those estimates.
loo(Model1,Model2,Model3)
## Warning: Found 17 observations with a pareto_k > 0.7 in model 'Model1'. It is
## recommended to set 'moment_match = TRUE' in order to perform moment matching
## for problematic observations.
## Warning: Found 10 observations with a pareto_k > 0.7 in model 'Model2'. It is
## recommended to set 'moment_match = TRUE' in order to perform moment matching
## for problematic observations.
## Warning: Found 10 observations with a pareto_k > 0.7 in model 'Model3'. It is
## recommended to set 'moment_match = TRUE' in order to perform moment matching
## for problematic observations.
## Output of model 'Model1':
##
## Computed from 8000 by 194 log-likelihood matrix
##
## Estimate SE
## elpd_loo -97.7 11.5
## p_loo 34.5 5.2
## looic 195.4 22.9
## ------
## Monte Carlo SE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. n_eff
## (-Inf, 0.5] (good) 154 79.4% 1467
## (0.5, 0.7] (ok) 23 11.9% 397
## (0.7, 1] (bad) 17 8.8% 90
## (1, Inf) (very bad) 0 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
##
## Output of model 'Model2':
##
## Computed from 8000 by 194 log-likelihood matrix
##
## Estimate SE
## elpd_loo -93.5 10.4
## p_loo 32.7 4.5
## looic 186.9 20.7
## ------
## Monte Carlo SE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. n_eff
## (-Inf, 0.5] (good) 164 84.5% 913
## (0.5, 0.7] (ok) 20 10.3% 330
## (0.7, 1] (bad) 9 4.6% 99
## (1, Inf) (very bad) 1 0.5% 8013
## See help('pareto-k-diagnostic') for details.
##
## Output of model 'Model3':
##
## Computed from 4000 by 194 log-likelihood matrix
##
## Estimate SE
## elpd_loo -96.7 11.4
## p_loo 32.9 5.2
## looic 193.3 22.8
## ------
## Monte Carlo SE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. n_eff
## (-Inf, 0.5] (good) 163 84.0% 600
## (0.5, 0.7] (ok) 21 10.8% 164
## (0.7, 1] (bad) 7 3.6% 3351
## (1, Inf) (very bad) 3 1.5% 38
## See help('pareto-k-diagnostic') for details.
##
## Model comparisons:
## elpd_diff se_diff
## Model2 0.0 0.0
## Model3 -3.2 3.0
## Model1 -4.2 3.0
Model diagnostics (mcmc_trace evaluation)
# Visual MCMC convergence check: trace plots should show well-mixed,
# stationary chains for every parameter. brms also reports divergent
# transitions when drawing these plots; none occurred for any model.
mcmc_plot(Model1, type = "trace")
## No divergences to plot.
mcmc_plot(Model2, type = "trace")
## No divergences to plot.
mcmc_plot(Model3, type = "trace")
## No divergences to plot.
Model evaluation on the test split (specificity and sensitivity analysis)
# ---- Model evaluation on the held-out test split ----
#
# classification_metrics: compute specificity and sensitivity for one fitted
# brms model on a test set, using a posterior-predictive-probability
# threshold (default 0.5).
#
# The confusion matrix is built as table(predicted, observed), so ROWS index
# the PREDICTED class and COLUMNS the OBSERVED class:
#   cm[2, 2]  predicted 1, observed 1  -> true positive  (TP)
#   cm[2, 1]  predicted 1, observed 0  -> false positive (FP)
#   cm[1, 2]  predicted 0, observed 1  -> false negative (FN)
#   cm[1, 1]  predicted 0, observed 0  -> true negative  (TN)
#
# BUG FIX: the original code read FP from cm[1, 2] and FN from cm[2, 1],
# i.e. the false-positive and false-negative counts were swapped for all
# three models, distorting both reported specificity and sensitivity.
classification_metrics <- function(model, newdata, threshold = 0.5) {
  # predict() on a brmsfit returns a matrix whose "Estimate" column holds
  # the posterior predictive mean probability of class 1.
  probs <- as.data.frame(predict(model, newdata = newdata, type = "response"))
  predicted <- ifelse(probs$Estimate > threshold, 1, 0)
  # Pin the factor levels so the table keeps its 2x2 shape even if one
  # class is never predicted in the test split.
  cm <- table(factor(predicted, levels = c(0, 1)),
              factor(newdata$Status, levels = c(0, 1)))
  TP <- cm[2, 2]
  FP <- cm[2, 1]
  TN <- cm[1, 1]
  FN <- cm[1, 2]
  c(specificity = TN / (TN + FP), sensitivity = TP / (TP + FN))
}

# Evaluate each model, keeping the per-model scalar names the original
# script created in case later code references them.
metrics_Model1 <- classification_metrics(Model1, test_data)
specificity <- metrics_Model1[["specificity"]]
sensitivity <- metrics_Model1[["sensitivity"]]

metrics_Model2 <- classification_metrics(Model2, test_data)
specificity_Model2 <- metrics_Model2[["specificity"]]
sensitivity_Model2 <- metrics_Model2[["sensitivity"]]

metrics_Model3 <- classification_metrics(Model3, test_data)
specificity_Model3 <- metrics_Model3[["specificity"]]
sensitivity_Model3 <- metrics_Model3[["sensitivity"]]

comparison_results <- data.frame(
  Model = c("Model1", "Model2", "Model3"),
  Specificity = c(specificity, specificity_Model2, specificity_Model3),
  Sensitivity = c(sensitivity, sensitivity_Model2, sensitivity_Model3)
)
print(comparison_results)
## Model Specificity Sensitivity
## 1 Model1 0.7800000 0.6875000
## 2 Model2 0.7708333 0.6470588
## 3 Model3 0.7600000 0.6562500
Stan code for each of the models
# Inspect the Stan program brms generated for Model1 (shows the per-coefficient
# priors and the three group-level effect structures declared in the formula).
stancode(Model1)
## // generated with brms 2.20.4
## functions {
## }
## data {
## int<lower=1> N; // total number of observations
## array[N] int Y; // response variable
## int<lower=1> K; // number of population-level effects
## matrix[N, K] X; // population-level design matrix
## int<lower=1> Kc; // number of population-level effects after centering
## // data for group-level effects of ID 1
## int<lower=1> N_1; // number of grouping levels
## int<lower=1> M_1; // number of coefficients per level
## array[N] int<lower=1> J_1; // grouping indicator per observation
## // group-level predictor values
## vector[N] Z_1_1;
## // data for group-level effects of ID 2
## int<lower=1> N_2; // number of grouping levels
## int<lower=1> M_2; // number of coefficients per level
## array[N] int<lower=1> J_2; // grouping indicator per observation
## // group-level predictor values
## vector[N] Z_2_1;
## // data for group-level effects of ID 3
## int<lower=1> N_3; // number of grouping levels
## int<lower=1> M_3; // number of coefficients per level
## array[N] int<lower=1> J_3; // grouping indicator per observation
## // group-level predictor values
## vector[N] Z_3_1;
## int prior_only; // should the likelihood be ignored?
## }
## transformed data {
## matrix[N, Kc] Xc; // centered version of X without an intercept
## vector[Kc] means_X; // column means of X before centering
## for (i in 2:K) {
## means_X[i - 1] = mean(X[, i]);
## Xc[, i - 1] = X[, i] - means_X[i - 1];
## }
## }
## parameters {
## vector[Kc] b; // regression coefficients
## real Intercept; // temporary intercept for centered predictors
## vector<lower=0>[M_1] sd_1; // group-level standard deviations
## array[M_1] vector[N_1] z_1; // standardized group-level effects
## vector<lower=0>[M_2] sd_2; // group-level standard deviations
## array[M_2] vector[N_2] z_2; // standardized group-level effects
## vector<lower=0>[M_3] sd_3; // group-level standard deviations
## array[M_3] vector[N_3] z_3; // standardized group-level effects
## }
## transformed parameters {
## vector[N_1] r_1_1; // actual group-level effects
## vector[N_2] r_2_1; // actual group-level effects
## vector[N_3] r_3_1; // actual group-level effects
## real lprior = 0; // prior contributions to the log posterior
## r_1_1 = (sd_1[1] * (z_1[1]));
## r_2_1 = (sd_2[1] * (z_2[1]));
## r_3_1 = (sd_3[1] * (z_3[1]));
## lprior += student_t_lpdf(b[1] | 4, 1, 2);
## lprior += normal_lpdf(b[2] | -0.5,0.25);
## lprior += student_t_lpdf(b[3] | 4, 1, 2);
## lprior += student_t_lpdf(b[4] | 4, 1, 2);
## lprior += student_t_lpdf(b[5] | 4, 1, 2);
## lprior += student_t_lpdf(b[6] | 4, 1, 2);
## lprior += student_t_lpdf(b[7] | 4, 1, 2);
## lprior += student_t_lpdf(b[8] | 4, 1, 2);
## lprior += student_t_lpdf(b[9] | 4, 1, 2);
## lprior += student_t_lpdf(b[10] | 4, 1, 2);
## lprior += student_t_lpdf(b[11] | 4, 1, 2);
## lprior += student_t_lpdf(b[12] | 4, 1, 2);
## lprior += student_t_lpdf(b[13] | 4, 1, 2);
## lprior += student_t_lpdf(b[14] | 4, 1, 2);
## lprior += student_t_lpdf(b[15] | 4, 1, 2);
## lprior += student_t_lpdf(b[16] | 4, 1, 2);
## lprior += student_t_lpdf(b[17] | 4, 1, 2);
## lprior += student_t_lpdf(b[18] | 4, 1, 2);
## lprior += student_t_lpdf(b[19] | 4, 1, 2);
## lprior += student_t_lpdf(b[20] | 4, 1, 2);
## lprior += student_t_lpdf(b[21] | 4, 1, 2);
## lprior += student_t_lpdf(b[22] | 4, 1, 2);
## lprior += student_t_lpdf(b[23] | 4, 1, 2);
## lprior += student_t_lpdf(b[24] | 4, 1, 2);
## lprior += student_t_lpdf(b[25] | 4, 1, 2);
## lprior += student_t_lpdf(b[26] | 4, 1, 2);
## lprior += student_t_lpdf(b[27] | 4, 1, 2);
## lprior += student_t_lpdf(b[28] | 4, 1, 2);
## lprior += student_t_lpdf(b[29] | 4, 1, 2);
## lprior += student_t_lpdf(b[30] | 4, 1, 2);
## lprior += student_t_lpdf(b[31] | 4, 1, 2);
## lprior += student_t_lpdf(Intercept | 3, 0, 2.5);
## lprior += student_t_lpdf(sd_1 | 3, 0, 2.5)
## - 1 * student_t_lccdf(0 | 3, 0, 2.5);
## lprior += student_t_lpdf(sd_2 | 3, 0, 2.5)
## - 1 * student_t_lccdf(0 | 3, 0, 2.5);
## lprior += student_t_lpdf(sd_3 | 3, 0, 2.5)
## - 1 * student_t_lccdf(0 | 3, 0, 2.5);
## }
## model {
## // likelihood including constants
## if (!prior_only) {
## // initialize linear predictor term
## vector[N] mu = rep_vector(0.0, N);
## mu += Intercept;
## for (n in 1:N) {
## // add more terms to the linear predictor
## mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_2_1[J_2[n]] * Z_2_1[n] + r_3_1[J_3[n]] * Z_3_1[n];
## }
## target += bernoulli_logit_glm_lpmf(Y | Xc, mu, b);
## }
## // priors including constants
## target += lprior;
## target += std_normal_lpdf(z_1[1]);
## target += std_normal_lpdf(z_2[1]);
## target += std_normal_lpdf(z_3[1]);
## }
## generated quantities {
## // actual population-level intercept
## real b_Intercept = Intercept - dot_product(means_X, b);
## }
# Inspect the Stan program brms generated for Model2 (same group-level
# structure as Model1 but with a single shared student_t prior on all
# regression coefficients).
stancode(Model2)
## // generated with brms 2.20.4
## functions {
## }
## data {
## int<lower=1> N; // total number of observations
## array[N] int Y; // response variable
## int<lower=1> K; // number of population-level effects
## matrix[N, K] X; // population-level design matrix
## int<lower=1> Kc; // number of population-level effects after centering
## // data for group-level effects of ID 1
## int<lower=1> N_1; // number of grouping levels
## int<lower=1> M_1; // number of coefficients per level
## array[N] int<lower=1> J_1; // grouping indicator per observation
## // group-level predictor values
## vector[N] Z_1_1;
## // data for group-level effects of ID 2
## int<lower=1> N_2; // number of grouping levels
## int<lower=1> M_2; // number of coefficients per level
## array[N] int<lower=1> J_2; // grouping indicator per observation
## // group-level predictor values
## vector[N] Z_2_1;
## // data for group-level effects of ID 3
## int<lower=1> N_3; // number of grouping levels
## int<lower=1> M_3; // number of coefficients per level
## array[N] int<lower=1> J_3; // grouping indicator per observation
## // group-level predictor values
## vector[N] Z_3_1;
## int prior_only; // should the likelihood be ignored?
## }
## transformed data {
## matrix[N, Kc] Xc; // centered version of X without an intercept
## vector[Kc] means_X; // column means of X before centering
## for (i in 2:K) {
## means_X[i - 1] = mean(X[, i]);
## Xc[, i - 1] = X[, i] - means_X[i - 1];
## }
## }
## parameters {
## vector[Kc] b; // regression coefficients
## real Intercept; // temporary intercept for centered predictors
## vector<lower=0>[M_1] sd_1; // group-level standard deviations
## array[M_1] vector[N_1] z_1; // standardized group-level effects
## vector<lower=0>[M_2] sd_2; // group-level standard deviations
## array[M_2] vector[N_2] z_2; // standardized group-level effects
## vector<lower=0>[M_3] sd_3; // group-level standard deviations
## array[M_3] vector[N_3] z_3; // standardized group-level effects
## }
## transformed parameters {
## vector[N_1] r_1_1; // actual group-level effects
## vector[N_2] r_2_1; // actual group-level effects
## vector[N_3] r_3_1; // actual group-level effects
## real lprior = 0; // prior contributions to the log posterior
## r_1_1 = (sd_1[1] * (z_1[1]));
## r_2_1 = (sd_2[1] * (z_2[1]));
## r_3_1 = (sd_3[1] * (z_3[1]));
## lprior += student_t_lpdf(b | 3, 0, 2);
## lprior += student_t_lpdf(Intercept | 3, 0, 2.5);
## lprior += student_t_lpdf(sd_1 | 3, 0, 2.5)
## - 1 * student_t_lccdf(0 | 3, 0, 2.5);
## lprior += student_t_lpdf(sd_2 | 3, 0, 2.5)
## - 1 * student_t_lccdf(0 | 3, 0, 2.5);
## lprior += student_t_lpdf(sd_3 | 3, 0, 2.5)
## - 1 * student_t_lccdf(0 | 3, 0, 2.5);
## }
## model {
## // likelihood including constants
## if (!prior_only) {
## // initialize linear predictor term
## vector[N] mu = rep_vector(0.0, N);
## mu += Intercept;
## for (n in 1:N) {
## // add more terms to the linear predictor
## mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_2_1[J_2[n]] * Z_2_1[n] + r_3_1[J_3[n]] * Z_3_1[n];
## }
## target += bernoulli_logit_glm_lpmf(Y | Xc, mu, b);
## }
## // priors including constants
## target += lprior;
## target += std_normal_lpdf(z_1[1]);
## target += std_normal_lpdf(z_2[1]);
## target += std_normal_lpdf(z_3[1]);
## }
## generated quantities {
## // actual population-level intercept
## real b_Intercept = Intercept - dot_product(means_X, b);
## }
# Inspect the Stan program brms generated for Model3 (population-level
# effects only, no group-level terms).
stancode(Model3)
## // generated with brms 2.20.4
## functions {
## }
## data {
## int<lower=1> N; // total number of observations
## array[N] int Y; // response variable
## int<lower=1> K; // number of population-level effects
## matrix[N, K] X; // population-level design matrix
## int<lower=1> Kc; // number of population-level effects after centering
## int prior_only; // should the likelihood be ignored?
## }
## transformed data {
## matrix[N, Kc] Xc; // centered version of X without an intercept
## vector[Kc] means_X; // column means of X before centering
## for (i in 2:K) {
## means_X[i - 1] = mean(X[, i]);
## Xc[, i - 1] = X[, i] - means_X[i - 1];
## }
## }
## parameters {
## vector[Kc] b; // regression coefficients
## real Intercept; // temporary intercept for centered predictors
## }
## transformed parameters {
## real lprior = 0; // prior contributions to the log posterior
## lprior += student_t_lpdf(b[1] | 4, 1, 2);
## lprior += normal_lpdf(b[2] | -0.5, 0.25);
## lprior += student_t_lpdf(b[3] | 4, 1, 2);
## lprior += student_t_lpdf(b[4] | 4, 1, 2);
## lprior += student_t_lpdf(b[5] | 4, 1, 2);
## lprior += student_t_lpdf(b[6] | 4, 1, 2);
## lprior += student_t_lpdf(b[7] | 4, 1, 2);
## lprior += student_t_lpdf(b[8] | 4, 1, 2);
## lprior += student_t_lpdf(b[9] | 4, 1, 2);
## lprior += student_t_lpdf(b[10] | 4, 1, 2);
## lprior += student_t_lpdf(b[11] | 4, 1, 2);
## lprior += student_t_lpdf(b[12] | 4, 1, 2);
## lprior += student_t_lpdf(b[13] | 4, 1, 2);
## lprior += student_t_lpdf(b[14] | 4, 1, 2);
## lprior += student_t_lpdf(b[15] | 4, 1, 2);
## lprior += student_t_lpdf(b[16] | 4, 1, 2);
## lprior += student_t_lpdf(b[17] | 4, 1, 2);
## lprior += student_t_lpdf(b[18] | 4, 1, 2);
## lprior += student_t_lpdf(b[19] | 4, 1, 2);
## lprior += student_t_lpdf(b[20] | 4, 1, 2);
## lprior += student_t_lpdf(b[21] | 4, 1, 2);
## lprior += student_t_lpdf(b[22] | 4, 1, 2);
## lprior += student_t_lpdf(b[23] | 4, 1, 2);
## lprior += student_t_lpdf(b[24] | 4, 1, 2);
## lprior += student_t_lpdf(b[25] | 4, 1, 2);
## lprior += student_t_lpdf(b[26] | 4, 1, 2);
## lprior += student_t_lpdf(b[27] | 4, 1, 2);
## lprior += student_t_lpdf(b[28] | 4, 1, 2);
## lprior += student_t_lpdf(b[29] | 4, 1, 2);
## lprior += student_t_lpdf(b[30] | 4, 1, 2);
## lprior += student_t_lpdf(b[31] | 4, 1, 2);
## lprior += student_t_lpdf(Intercept | 3, 0, 2.5);
## }
## model {
## // likelihood including constants
## if (!prior_only) {
## target += bernoulli_logit_glm_lpmf(Y | Xc, Intercept, b);
## }
## // priors including constants
## target += lprior;
## }
## generated quantities {
## // actual population-level intercept
## real b_Intercept = Intercept - dot_product(means_X, b);
## }